{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "# https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/\n",
    "import numpy\n",
    "import pandas\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense\n",
    "from keras.wrappers.scikit_learn import KerasClassifier\n",
    "from keras.utils import np_utils\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.model_selection import cross_val_predict\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.metrics import matthews_corrcoef\n",
    "from sklearn.metrics import make_scorer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize Random Number Generator\n",
    "# fix random seed for reproducibility\n",
    "seed = 7\n",
    "numpy.random.seed(seed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load dataset\n",
    "# dataframe = pandas.read_csv(\"kddforpandatrain.csv\")#, header=True)\n",
    "dataframe = pandas.read_csv(\"kdd_dataset.csv\")#, header=True)  # read the whole 10% dataset into dataframe\n",
    "\n",
    "# samples 3000 random data points from 500k\n",
    "dataframe = dataframe.sample(n=100)\n",
    "\n",
    "# LabelEncoder, turns all our categorical data into integers\n",
    "le = LabelEncoder()\n",
    "\n",
    "\n",
    "# apply \"le.fit_transform\" to every column (usually only works on 1 column)\n",
    "dataframe_encoded = dataframe.apply(le.fit_transform)\n",
    "attack_labels = le.classes_\n",
    "dataset = dataframe_encoded.values\n",
    "\n",
    "#Set X as our input data and Y as our label\n",
    "X = dataset[:,0:41].astype(float)\n",
    "Y = dataset[:,41]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6\n"
     ]
    }
   ],
   "source": [
    "# encode class values as integers\n",
    "encoder = LabelEncoder()\n",
    "encoder.fit(Y)\n",
    "encoded_Y = encoder.transform(Y)\n",
    "# convert integers to dummy variables (i.e. one hot encoded)\n",
    "dummy_y = np_utils.to_categorical(encoded_Y)\n",
    "# print(dummy_y)\n",
    "print(len(dummy_y[0]))\n",
    "num_of_classes = len(dummy_y[0])  # the length of dummy y is the number of classes we have in our small sample\n",
    "# since we are randomly sampling from a large dataset, we might not get 1 of every class in our sample\n",
    "# we need to set output layer to be equal to the length of our dummy_y vectors\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define baseline model\n",
    "def baseline_model():\n",
    "    # create model\n",
    "    model = Sequential()\n",
    "    \n",
    "    inputs = 41\n",
    "    hidden_layer1 = 18\n",
    "    hidden_layer2 = 6\n",
    "    hidden_layer3 = 0\n",
    "    outputs = num_of_classes  #needs to be this variable in case we forget to sample. Could end up having 10 classes or 12, etc\n",
    "    \n",
    "    model.add(Dense(hidden_layer1, input_dim=inputs, activation='relu'))\n",
    "    if hidden_layer2 != 0:\n",
    "        model.add(Dense(hidden_layer2, activation='relu'))\n",
    "    if hidden_layer3 != 0:\n",
    "        model.add(Dense(hidden_layer3, activation='relu'))\n",
    "    model.add(Dense(outputs, activation='softmax'))\n",
    "    \n",
    "    # Compile model\n",
    "    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "matthews_scorer = make_scorer(matthews_corrcoef)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:258: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
      "  if np.all([l not in y_true for l in labels]):\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "At least one label specified must be in y_true",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-7-fc94fa16bb62>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcross_val_predict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdummy_y\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mkfold\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mcm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconfusion_matrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattack_labels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      8\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcm\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py\u001b[0m in \u001b[0;36mconfusion_matrix\u001b[0;34m(y_true, y_pred, labels, sample_weight)\u001b[0m\n\u001b[1;32m    257\u001b[0m         \u001b[0mlabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    258\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0ml\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0my_true\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ml\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 259\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"At least one label specified must be in y_true\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    260\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    261\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0msample_weight\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: At least one label specified must be in y_true"
     ]
    }
   ],
   "source": [
    "\n",
    "estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)\n",
    "\n",
    "kfold = KFold(n_splits=5, shuffle=True, random_state=seed)\n",
    "y_pred = cross_val_predict(estimator, X, dummy_y, cv=kfold)\n",
    "\n",
    "cm = confusion_matrix(Y, y_pred, labels=attack_labels)\n",
    "print(cm)\n",
    "\n",
    "#put calculations here \n",
    "print(\"total: \" + str(cm.sum()))\n",
    "print(\"accuracy: \" + str(numpy.trace(cm) / cm.sum()))\n",
    "\n",
    "# print(\"Baseline: %.2f%% (%.2f%%)\" % (results.mean()*100, results.std()*100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:anaconda3]",
   "language": "python",
   "name": "conda-env-anaconda3-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
